Objective

Predicting the average total number of trips per hour across the week for unseen stations (stations not used in training).

Census Dataset Acquisition

# Importing the census data CSV file (London ward-level attributes)
library(data.table)
census = fread("data/London_census.csv", header = T,sep = ",", dec = ".")
head(census)
##     WardCode       WardName              borough NESW AreaSqKm      lon
## 1: E05000026          Abbey Barking and Dagenham East      1.3 0.077935
## 2: E05000027         Alibon Barking and Dagenham East      1.4 0.148270
## 3: E05000028      Becontree Barking and Dagenham East      1.3 0.118957
## 4: E05000029 Chadwell Heath Barking and Dagenham East      3.4 0.139985
## 5: E05000030      Eastbrook Barking and Dagenham East      3.5 0.173581
## 6: E05000031       Eastbury Barking and Dagenham East      1.4 0.105683
##         lat IncomeScor LivingEnSc NoEmployee GrenSpace PopDen BornUK NotBornUK
## 1: 51.53971       0.27      42.76       7900      19.6 9884.6   5459      7327
## 2: 51.54559       0.28      27.96        800      22.4 7464.3   7824      2561
## 3: 51.55453       0.25      31.59       1100       3.0 8923.1   8075      3470
## 4: 51.58475       0.27      34.78       1700      56.4 2970.6   7539      2482
## 5: 51.55365       0.19      21.25       4000      51.1 3014.3   8514      1992
## 6: 51.53590       0.27      31.16       1000      18.1 8357.1   7880      3744
##    NoCTFtoH NoDwelling NoFlats NoHouses NoOwndDwel MedHPrice
## 1:      0.1       4733    3153     1600       1545    177000
## 2:      0.1       4045     574     3471       1849    160000
## 3:      0.1       4378     837     3541       2093    170000
## 4:      0.4       4050    1400     2662       2148    195000
## 5:      0.5       3976     742     3235       2646    191750
## 6:      0.0       4321     933     3388       1913    167250
dim(census)
## [1] 625  20
# Checking which columns can act as a unique key for joins
uniqueN(census)
## [1] 625
uniqueN(census,by="WardName")
## [1] 604
uniqueN(census,by="borough")
## [1] 33
uniqueN(census,by="NESW")
## [1] 5
# WardCode uniquely identifies every row, so it can serve as the primary key
uniqueN(census,by="WardCode")
## [1] 625

The census data has 625 rows and 20 variables. From the uniqueness analysis, we can use WardCode as our primary key when joining tables.

Census Data Preparation

library(gridExtra)
library(ggplot2)
# Distributions of employees and median house price, coloured by compass region (NESW)
p1 = ggplot(census,aes(NoEmployee))+geom_histogram(aes(fill=NESW),bins = 20)
p2 = ggplot(census,aes(MedHPrice))+geom_histogram(aes(fill=NESW),bins = 20)
grid.arrange(p1, p2, nrow=2)

The Number of Employees and Median House Price variables have outliers, which are mainly concentrated in central London.

# Creating the total population metric (density x area)
census$Pop = census$PopDen * census$AreaSqKm

# The independent variables below are highly correlated and carry the same information
cor(census$NoDwelling,(census$NoFlats+census$NoHouses))
## [1] 0.9985431
cor(census$NoCTFtoH,census$MedHPrice)
## [1] 0.8169244
cor(census$Pop,(census$BornUK+census$NotBornUK))
## [1] 0.999896
# Dropping the redundant columns
census[,c("PopDen","NoCTFtoH","NoDwelling","Pop")]=NULL

# Log-transforming the skewed variables, NoEmployee and MedHPrice
census$NoEmployee=log(census$NoEmployee)
census$MedHPrice=log(census$MedHPrice)
p1 = ggplot(census,aes(NoEmployee))+geom_histogram(aes(fill=NESW),bins = 20)
p2 = ggplot(census,aes(MedHPrice))+geom_histogram(aes(fill=NESW),bins = 20)
grid.arrange(p1, p2, nrow=2)

Stations Data Acquisition

# Importing the bike-stations data CSV file
stations = fread("data/bike_stations.csv", header = T,sep = ",", dec = ".")
head(stations)
##    Station_ID Capacity Latitude Longitude                         Station_Name
## 1:          1       19 51.52916 -0.109970           River Street , Clerkenwell
## 2:          2       37 51.49961 -0.197574       Phillimore Gardens, Kensington
## 3:          3       32 51.52128 -0.084605 Christopher Street, Liverpool Street
## 4:          4       23 51.53006 -0.120973      St. Chad's Street, King's Cross
## 5:          5       27 51.49313 -0.156876        Sedding Street, Sloane Square
## 6:          6       18 51.51812 -0.144228       Broadcasting House, Marylebone
dim(stations)
## [1] 773   5

The stations data has 773 stations and 5 variables describing the location and capacity of each station.

Merging the census and stations data Preparation

# Visualizing the overlap between ward centers (census) and bike stations
ggplot(census,aes(x=lon,y=lat))+geom_point(col="red")+
geom_point(data=stations,aes(x=Longitude,y=Latitude),col="blue",alpha=0.25)+labs(x = "Longitude", y = "Latitude", title = "London Ward centers in Red and Stations Coords in blue")

# Zooming in on the intersection (wards outside the stations' bounding box are dropped, hence the warning)
ggplot(census,aes(x=lon,y=lat))+geom_point(col="red")+
geom_point(data=stations,aes(x=Longitude,y=Latitude),col="blue",alpha=0.25)+
xlim(range(stations$Longitude))+ylim(range(stations$Latitude))+labs(x = "Longitude", y = "Latitude", title = "London Ward centers in Red and Stations Coords in blue")
## Warning: Removed 478 rows containing missing values (geom_point).

We use K-means with a single iteration in order to find the nearest centroids, which are the ward center coordinates; these are in turn assigned to the stations, so the census and stations tables share keys and can be merged.

# Stacking ward-center and station coordinates into a single two-column table
combined = rbind(census[,c("lon","lat")],stations[,c("Longitude","Latitude")],use.names=FALSE)

# K-means seeded with the 625 ward centers; one iteration simply assigns each
# point to its nearest ward center (the non-convergence warning is expected)
clusters = kmeans(combined,combined[1:625,],iter.max = 1)
## Warning: did not converge in 1 iteration
# Assigning the nearest WardCode to each station (stored in Station_Name)
stations$Station_Name=census$WardCode[clusters$cluster[626:nrow(combined)]]

# Row indices (into census) of the ward centers selected by at least one station
# NOTE(review): `c` shadows base::c within this script; consider renaming
c = unique(clusters$cluster[626:nrow(combined)])

Visualizing the selected centroids

library(rgdal)
## Loading required package: sp
## rgdal: version: 1.4-8, (SVN revision 845)
##  Geospatial Data Abstraction Library extensions to R successfully loaded
##  Loaded GDAL runtime: GDAL 2.4.2, released 2019/06/28
##  Path to GDAL shared files: /Library/Frameworks/R.framework/Versions/3.6/Resources/library/rgdal/gdal
##  GDAL binary built with GEOS: FALSE 
##  Loaded PROJ.4 runtime: Rel. 5.2.0, September 15th, 2018, [PJ_VERSION: 520]
##  Path to PROJ.4 shared files: /Library/Frameworks/R.framework/Versions/3.6/Resources/library/rgdal/proj
##  Linking to sp version: 1.3-2
library(sp)
library(rgeos)
## rgeos version: 0.5-2, (SVN revision 621)
##  GEOS runtime version: 3.7.2-CAPI-1.11.2 
##  Linking to sp version: 1.3-1 
##  Polygon checking: TRUE
# Using the London_Ward shapefile from the London government website to plot ward boundaries
m = readOGR(dsn=path.expand("./"), "London_Ward")
## OGR data source with driver: ESRI Shapefile 
## Source: "/Users/islamhasabo/GitHub/Machine-Learning/R/London-Bikes", layer: "London_Ward"
## with 657 features
## It has 6 fields
# Transforming the coordinates to WGS84 longitude/latitude to match our data
m.wgs84 = spTransform(m, CRS("+proj=longlat +datum=WGS84"))

# Plotting ward boundaries with ward centers, stations and the selected centroids
ggplot(m.wgs84)+geom_polygon(aes(x = long, y = lat, group = group), fill = "white", colour = "gray")+
xlim(range(stations$Longitude))+ylim(range(stations$Latitude))+
geom_point(data = census,aes(x=lon,y=lat),col="red")+
geom_point(data=stations,aes(x=Longitude,y=Latitude),col="blue",alpha=0.25)+
geom_point(data=census[c],aes(x=lon,y=lat),col="orange") +
labs(x = "Longitude", y = "Latitude", title = "Part of London Map with Ward boundaires",
subtitle = "Red dots for Ward Centers, Blue for Station Coords and Orange for the selected Ward Centers")
## Regions defined for each Polygons
## Warning: Removed 478 rows containing missing values (geom_point).
## Warning: Removed 4 rows containing missing values (geom_point).

# Merging census and stations on the census WardCode and the stations Station_Name
# (which now holds the WardCode of the nearest ward center)
merge1 = merge(census, stations, by.x="WardCode",by.y ="Station_Name")

# Distances between station coordinates and the selected ward centers
# (r = 6378 is Earth's radius in km, so distances come out in km)
library(geosphere)
di= distHaversine(merge1[,c("lon","lat")],merge1[,c("Longitude","Latitude")],r = 6378)

# Keeping only the single closest station per ward center, to avoid duplicated census rows and reduce the stations-assigned-to-a-wrong-ward issue
merge1$di = di
merge1 = merge1 [order(di)]
merge11 =merge1[!duplicated(WardCode)]
head(merge11[,c("WardCode","Station_ID")])
##     WardCode Station_ID
## 1: E05000623        609
## 2: E05000426        654
## 3: E05000392        442
## 4: E05000429        794
## 5: E05000391        559
## 6: E05000583        576

113 out of the 773 stations have been selected. They are the ones with the least distance to their ward centers.

# Visualizing the final selected stations against the ward boundaries
ggplot(m.wgs84)+geom_polygon(aes(x = long, y = lat, group = group), fill = "white", colour = "gray")+
xlim(range(stations$Longitude))+ylim(range(stations$Latitude))+
geom_point(data = merge11,aes(x=Longitude,y=Latitude),col="blue",alpha=0.25)+
geom_point(data=census[c],aes(x=lon,y=lat),col="orange") +
labs(x = "Longitude", y = "Latitude", title = "Part of London Map with Ward boundaires",
subtitle = "Red dots for Ward Centers and Orange for the selected Ward Centers")
## Regions defined for each Polygons
## Warning: Removed 4 rows containing missing values (geom_point).

Journeys Data Acquisition

# Importing the journeys data CSV file
journeys = fread("data/bike_journeys.csv", header = T,sep = ",", dec = ".")
head(journeys)
##    Journey_Duration Journey_ID End_Date End_Month End_Year End_Hour End_Minute
## 1:             2040        953       19         9       17       18          0
## 2:             1800      12581       19         9       17       15         21
## 3:             1140       1159       15         9       17       17          1
## 4:              420       2375       14         9       17       12         16
## 5:             1200      14659       13         9       17       19         33
## 6:             1320       2351       14         9       17       14         53
##    End_Station_ID Start_Date Start_Month Start_Year Start_Hour Start_Minute
## 1:            478         19           9         17         17           26
## 2:            122         19           9         17         14           51
## 3:            639         15           9         17         16           42
## 4:            755         14           9         17         12            9
## 5:            605         13           9         17         19           13
## 6:            514         14           9         17         14           31
##    Start_Station_ID
## 1:              251
## 2:              550
## 3:              212
## 4:              163
## 5:               36
## 6:              589
# Roughly 1.5 million journey records
dim(journeys)
## [1] 1542844      14

1,542,844 observations with 14 features describing each trip's start, end and duration.

Journeys Data Preparation

# Considering the start attributes as the indicator of demand

# Aggregate the number of trips per hour per station (N = trip count)
j1 = journeys[,.N,by=.(Start_Station_ID,Start_Year,Start_Month,Start_Date,Start_Hour)]

# Converting the date/time attributes to POSIX datetime (two-digit years, e.g. 17 = 2017)
d = ISOdatetime(j1$Start_Year+2000,j1$Start_Month,j1$Start_Date,j1$Start_Hour, min = 0, sec=0, tz = "")

# Extracting informative features: week of year, day of year, weekday and a weekend flag
p = as.POSIXlt(d)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## The following object is masked from 'package:base':
## 
##     date
j1$Station=j1$Start_Station_ID
j1$Week=isoweek(p)
j1$Day = p$yday
# week_start = 1 makes Monday the first day of the week
j1$WeekDay=wday(p,label = TRUE,week_start = getOption("lubridate.week.start", 1))
j1$Hour=hour(p)
library(chron)
## 
## Attaching package: 'chron'
## The following objects are masked from 'package:lubridate':
## 
##     days, hours, minutes, seconds, years
# 1 when the observation falls on a weekend, 0 otherwise
j1$WeekEnd = as.integer(as.logical(is.weekend(p)))
summary(j1)
##  Start_Station_ID   Start_Year  Start_Month      Start_Date      Start_Hour   
##  Min.   :  1.0    Min.   :17   Min.   :8.000   Min.   : 1.00   Min.   : 0.00  
##  1st Qu.:187.0    1st Qu.:17   1st Qu.:8.000   1st Qu.: 7.00   1st Qu.: 9.00  
##  Median :381.0    Median :17   Median :8.000   Median :13.00   Median :14.00  
##  Mean   :395.1    Mean   :17   Mean   :8.376   Mean   :13.74   Mean   :13.45  
##  3rd Qu.:604.0    3rd Qu.:17   3rd Qu.:9.000   3rd Qu.:19.00   3rd Qu.:18.00  
##  Max.   :826.0    Max.   :17   Max.   :9.000   Max.   :31.00   Max.   :23.00  
##                                                                               
##        N              Station           Week            Day        WeekDay    
##  Min.   :  1.000   Min.   :  1.0   Min.   :31.00   Min.   :212.0   Mon:63809  
##  1st Qu.:  1.000   1st Qu.:187.0   1st Qu.:32.00   1st Qu.:224.0   Tue:78012  
##  Median :  2.000   Median :381.0   Median :34.00   Median :236.0   Wed:61465  
##  Mean   :  3.318   Mean   :395.1   Mean   :34.21   Mean   :236.4   Thu:69243  
##  3rd Qu.:  4.000   3rd Qu.:604.0   3rd Qu.:36.00   3rd Qu.:249.0   Fri:68325  
##  Max.   :182.000   Max.   :826.0   Max.   :38.00   Max.   :261.0   Sat:64433  
##                                                                    Sun:59681  
##       Hour          WeekEnd      
##  Min.   : 0.00   Min.   :0.0000  
##  1st Qu.: 9.00   1st Qu.:0.0000  
##  Median :14.00   Median :0.0000  
##  Mean   :13.45   Mean   :0.2669  
##  3rd Qu.:18.00   3rd Qu.:1.0000  
##  Max.   :23.00   Max.   :1.0000  
## 

The aggregated trips per hour have a right-skewed distribution. The journeys span 50 consecutive days across 8 weeks, between 826 stations.

Time Series Analysis

# Reshaping to wide format (one column per station) as valid input for the time-series functions
j2 = dcast(j1, Start_Year+Start_Month+Start_Date+Start_Hour~Start_Station_ID,value.var = "N")
# Imputing the NA entries caused by hours with no trips
j2[is.na(j2)]=0

library(forecast)
## Registered S3 method overwritten by 'xts':
##   method     from
##   as.zoo.xts zoo
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## Registered S3 methods overwritten by 'forecast':
##   method             from    
##   fitted.fracdiff    fracdiff
##   residuals.fracdiff fracdiff
library(seasonal)
library(rugarch)
## Loading required package: parallel
## 
## Attaching package: 'rugarch'
## The following object is masked from 'package:stats':
## 
##     sigma
# Multi-seasonal time series for one station (ID 129), with daily (24 h) and weekly (168 h) seasonal periods over hourly slots
t = msts(j2$'129', seasonal.periods=c(24,168),ts.frequency=24,start=c(212,0))
# Decomposing and visualizing the time series
t %>% mstl(s.window="periodic", robust=TRUE) %>% autoplot(1) + xlab("Day")

# Forecasting using seasonal decomposition (stlf)
t %>%  stlf() %>% autoplot() + xlab("Day")

The seasonality over the one-day interval is much stronger than over the one-week interval. The trend is almost steady. Forecasting the number of trips per hour per station is possible; however, it varies from station to station, which does not meet our main objective.

# Visualizing the mean and sum of total trips per weekday, over all stations, across the full 50-day period

j3 = j1[,.(sum(N),mean(N)),by=.(Station,Week,WeekDay,Day)]
p1 = ggplot(j3,aes(x=WeekDay,y=V1))+geom_col(fill="purple")+ylab("Sum")
p2 = ggplot(j3,aes(x=WeekDay,y=V2))+geom_col(fill="violet")+ylab("Mean")
grid.arrange(p1, p2, ncol=2)

Sum and mean show similar distributions across the weekdays.

# Visualizing the mean total trips per weekday of all stations, for every week in the period

ggplot(j3,aes(x=WeekDay,y=V2))+geom_col(aes(fill = Week))+facet_wrap(~Week)

Some days like Tue in Week 31 and Wed in week 32 have significant variations from the rest of the days.

# Visualizing the mean total number of trips per weekday of all stations, in every hour
j4 = j1[,mean(N),.(Station,WeekDay,WeekEnd,Hour)]
ggplot(j4,aes(x=WeekDay,y=V1))+geom_col(aes(fill = Hour))+facet_wrap(~Hour)

WeekDay*Hour seems to have a high predictive power. Weekends have more number of trips in the off peak hours.

Merging the three datasets

# Merging the aggregated journeys (j4) with the census/stations table (merge11)
merge2 = merge(j4, merge11, by.x="Station",by.y ="Station_ID")
head(merge2)
##    Station WeekDay WeekEnd Hour        V1  WardCode   WardName borough    NESW
## 1:      12     Mon       0   12  3.857143 E05000129 Bloomsbury  Camden Central
## 2:      12     Mon       0   10  1.666667 E05000129 Bloomsbury  Camden Central
## 3:      12     Sun       1   16  4.285714 E05000129 Bloomsbury  Camden Central
## 4:      12     Wed       0   18 12.571429 E05000129 Bloomsbury  Camden Central
## 5:      12     Mon       0   18 14.571429 E05000129 Bloomsbury  Camden Central
## 6:      12     Thu       0   18 14.714286 E05000129 Bloomsbury  Camden Central
##    AreaSqKm       lon      lat IncomeScor LivingEnSc NoEmployee GrenSpace
## 1:        1 -0.131436 51.52199       0.11      54.65   11.14764       9.2
## 2:        1 -0.131436 51.52199       0.11      54.65   11.14764       9.2
## 3:        1 -0.131436 51.52199       0.11      54.65   11.14764       9.2
## 4:        1 -0.131436 51.52199       0.11      54.65   11.14764       9.2
## 5:        1 -0.131436 51.52199       0.11      54.65   11.14764       9.2
## 6:        1 -0.131436 51.52199       0.11      54.65   11.14764       9.2
##    BornUK NotBornUK NoFlats NoHouses NoOwndDwel MedHPrice Capacity Latitude
## 1:   5317      5575    5226      197       1293  12.80765       49 51.52168
## 2:   5317      5575    5226      197       1293  12.80765       49 51.52168
## 3:   5317      5575    5226      197       1293  12.80765       49 51.52168
## 4:   5317      5575    5226      197       1293  12.80765       49 51.52168
## 5:   5317      5575    5226      197       1293  12.80765       49 51.52168
## 6:   5317      5575    5226      197       1293  12.80765       49 51.52168
##    Longitude         di
## 1: -0.130431 0.07754604
## 2: -0.130431 0.07754604
## 3: -0.130431 0.07754604
## 4: -0.130431 0.07754604
## 5: -0.130431 0.07754604
## 6: -0.130431 0.07754604
# All 113 selected stations are present after the merge
dim(merge2)
## [1] 15827    26
length(unique(merge2$Station))
## [1] 113

After merging the 113 stations which have unique census data, all of them have recorded trips.

# Converting WeekDay, Hour and WeekEnd into categorical (factor) attributes
merge2$WeekDay=factor(merge2$WeekDay,ordered = FALSE)
merge2$Hour=as.factor(merge2$Hour)
merge2$WeekEnd=as.factor(merge2$WeekEnd)
# Dropping the non-predictive columns and the redundant ones
merge2[,c("WardCode","WardName","NESW","borough","lon","lat")]=NULL
# Removing rows with missing values
merge2=merge2[complete.cases(merge2)]

Machine Learning input Data Preparation

# Standardizing (z-scoring) the numerical independent features
bike=scale(merge2[,c("AreaSqKm","IncomeScor","LivingEnSc","NoEmployee","GrenSpace","BornUK","NotBornUK","NoFlats","NoHouses","NoOwndDwel","MedHPrice","Capacity","Longitude","Latitude")])

# Splitting the stations randomly between training and test sets with an 80/20 ratio
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:lubridate':
## 
##     intersect, setdiff, union
## The following objects are masked from 'package:rgeos':
## 
##     intersect, setdiff, union
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
set.seed(20)
# Recombining the scaled features with the target (V1) and the categorical columns
bikeTrain = cbind(bike,merge2[,c("V1","WeekDay","WeekEnd","Hour","Station")])
# Training set: 80% of the unique stations, sampled at random
bike1Train = bikeTrain%>%subset(bikeTrain$Station%in%sample(unique(bikeTrain$Station),0.8*length(unique(bikeTrain$Station))))
bikeTest  = cbind(bike,merge2[,c("V1","WeekDay","WeekEnd","Hour","Station")])
# Test set: the remaining stations, disjoint from training
bike1Test  = bikeTest %>%subset(!bikeTest$Station%in%unique(bike1Train$Station))
length(unique(bike1Train$Station))
## [1] 90
length(unique(bike1Test$Station))
## [1] 23
# Dropping the Station ID, which is not informative for unseen stations
bike1Train$Station = NULL
bike1Test$Station=NULL

90 random stations will be used in training the models and other different 23 in testing

Modelling

Linear Regression

# Creating a new data.table for performance comparison
perf = data.table()
# Linear modelling with the significantly predictive census features
linearModel = lm(V1 ~(WeekDay+Hour+AreaSqKm+IncomeScor+NoEmployee+GrenSpace+BornUK+NotBornUK+NoFlats+NoOwndDwel+MedHPrice+Capacity+Longitude+Latitude),data=bike1Train)

summary(linearModel)
## 
## Call:
## lm(formula = V1 ~ (WeekDay + Hour + AreaSqKm + IncomeScor + NoEmployee + 
##     GrenSpace + BornUK + NotBornUK + NoFlats + NoOwndDwel + MedHPrice + 
##     Capacity + Longitude + Latitude), data = bike1Train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.8996 -0.8894 -0.2642  0.4883 28.7164 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.49930    0.09279  16.159  < 2e-16 ***
## WeekDayTue   0.08021    0.05698   1.408 0.159219    
## WeekDayWed  -0.10623    0.05750  -1.847 0.064708 .  
## WeekDayThu   0.01764    0.05713   0.309 0.757492    
## WeekDayFri  -0.02070    0.05680  -0.364 0.715594    
## WeekDaySat   0.07249    0.05676   1.277 0.201531    
## WeekDaySun   0.04247    0.05710   0.744 0.457070    
## Hour1       -0.08556    0.12714  -0.673 0.500988    
## Hour2       -0.18876    0.13542  -1.394 0.163391    
## Hour3       -0.15973    0.14763  -1.082 0.279277    
## Hour4       -0.32693    0.15366  -2.128 0.033390 *  
## Hour5       -0.26748    0.12745  -2.099 0.035860 *  
## Hour6        0.20998    0.11216   1.872 0.061206 .  
## Hour7        1.28941    0.10989  11.734  < 2e-16 ***
## Hour8        2.80514    0.10895  25.748  < 2e-16 ***
## Hour9        1.16098    0.10870  10.681  < 2e-16 ***
## Hour10       0.71147    0.10882   6.538 6.49e-11 ***
## Hour11       0.87788    0.10885   8.065 8.01e-16 ***
## Hour12       1.09292    0.10865  10.059  < 2e-16 ***
## Hour13       1.13038    0.10879  10.390  < 2e-16 ***
## Hour14       1.06435    0.10886   9.778  < 2e-16 ***
## Hour15       1.13072    0.10872  10.400  < 2e-16 ***
## Hour16       1.37593    0.10872  12.656  < 2e-16 ***
## Hour17       2.15139    0.10856  19.818  < 2e-16 ***
## Hour18       2.03058    0.10859  18.700  < 2e-16 ***
## Hour19       1.16570    0.10869  10.725  < 2e-16 ***
## Hour20       0.54996    0.10889   5.051 4.47e-07 ***
## Hour21       0.28638    0.11018   2.599 0.009352 ** 
## Hour22       0.18774    0.11057   1.698 0.089558 .  
## Hour23       0.10552    0.11297   0.934 0.350288    
## AreaSqKm     0.11925    0.02452   4.863 1.17e-06 ***
## IncomeScor  -0.34732    0.04019  -8.642  < 2e-16 ***
## NoEmployee   0.13403    0.02476   5.413 6.33e-08 ***
## GrenSpace   -0.11647    0.02143  -5.434 5.60e-08 ***
## BornUK      -0.04829    0.03714  -1.300 0.193612    
## NotBornUK   -0.33145    0.02589 -12.804  < 2e-16 ***
## NoFlats      0.13098    0.02773   4.724 2.34e-06 ***
## NoOwndDwel  -0.10959    0.03365  -3.257 0.001130 ** 
## MedHPrice   -0.12104    0.03102  -3.901 9.61e-05 ***
## Capacity     0.21120    0.01614  13.087  < 2e-16 ***
## Longitude    0.08838    0.02551   3.464 0.000534 ***
## Latitude     0.31288    0.02230  14.029  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.7 on 12492 degrees of freedom
## Multiple R-squared:  0.2385, Adjusted R-squared:  0.236 
## F-statistic: 95.45 on 41 and 12492 DF,  p-value: < 2.2e-16
# Predicting on the held-out test stations
perf$lm_pred = predict(linearModel,bike1Test)
perf$lm_actual = (bike1Test[,"V1"])
library(Metrics)
## 
## Attaching package: 'Metrics'
## The following object is masked from 'package:forecast':
## 
##     accuracy
library(MLmetrics)
## 
## Attaching package: 'MLmetrics'
## The following object is masked from 'package:base':
## 
##     Recall
# Test-set R-squared and RMSE for the linear model
R2_Score(perf$lm_actual, perf$lm_pred)
## [1] -1.189744
RMSE(perf$lm_actual, perf$lm_pred)
## [1] 1.31405
head(perf$lm_pred)
## [1] 2.847810 2.865452 4.436044 4.342854 2.478746 1.738556
head(perf$lm_actual)
## [1] 2.000000 1.666667 1.000000 1.428571 1.666667 1.500000

Census data has a low impact in explaining the variation in the number of trips.

Linear Regression With Regularization (Ridge)

Preparing the input as matrices instead of data frames: x, y for training and x1, y1 for testing.

library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following objects are masked from 'package:MLmetrics':
## 
##     MAE, RMSE
## The following objects are masked from 'package:Metrics':
## 
##     precision, recall
# One-hot encoding the categorical attributes
dmy = dummyVars(~.,data=bike1Train)
trsf = data.frame(predict(dmy,newdata = bike1Train ))

# Creating new features from the encoded variables, based on logic-gate style interactions, to reflect the variation of peak hours and its relation with the weekend

# Weekend (holiday) non-peak hours: weekend mornings 7-8
trsf$HNP = (trsf$WeekDay.Sat+trsf$WeekDay.Sun)*(trsf$Hour.7+trsf$Hour.8)

# Weekend (holiday) peak hours: weekend midday 11-14
trsf$HP = (trsf$WeekDay.Sat+trsf$WeekDay.Sun)*(trsf$Hour.11+trsf$Hour.12+trsf$Hour.13+trsf$Hour.14)

# Non-peak hours (night, 0-5)
trsf$NP = trsf$Hour.0+trsf$Hour.1+trsf$Hour.2+trsf$Hour.3+trsf$Hour.4+trsf$Hour.5

# Peak hours (commute times 7-8 and 17-18)
trsf$P = trsf$Hour.7+trsf$Hour.8+trsf$Hour.17+trsf$Hour.18

# Converting from data frame to matrix for glmnet
y <- as.matrix( trsf[, "V1"] )
x <- as.matrix( trsf[, names(trsf)[names(trsf)!="V1"]] )

Correlation

library(psych)
## 
## Attaching package: 'psych'
## The following object is masked from 'package:MLmetrics':
## 
##     AUC
## The following object is masked from 'package:seasonal':
## 
##     outlier
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
# Pairwise correlations of the scaled numeric features, with FDR-adjusted p-values
mycorr = corr.test(trsf[,1:15],adjust = "fdr")
library(corrplot)
## corrplot 0.84 loaded
# Correlation of each numeric feature (columns 1-14) with the target V1 (column 15)
corr.test(trsf[,1:14],trsf[,15],adjust = "fdr")
## Call:corr.test(x = trsf[, 1:14], y = trsf[, 15], adjust = "fdr")
## Correlation matrix 
##             [,1]
## AreaSqKm    0.05
## IncomeScor -0.06
## LivingEnSc  0.06
## NoEmployee  0.16
## GrenSpace  -0.03
## BornUK     -0.09
## NotBornUK  -0.10
## NoFlats     0.01
## NoHouses   -0.08
## NoOwndDwel -0.08
## MedHPrice   0.05
## Capacity    0.11
## Longitude   0.03
## Latitude    0.13
## Sample Size 
## [1] 12534
## Probability values  adjusted for multiple tests. 
##            [,1]
## AreaSqKm   0.00
## IncomeScor 0.00
## LivingEnSc 0.00
## NoEmployee 0.00
## GrenSpace  0.00
## BornUK     0.00
## NotBornUK  0.00
## NoFlats    0.29
## NoHouses   0.00
## NoOwndDwel 0.00
## MedHPrice  0.00
## Capacity   0.00
## Longitude  0.00
## Latitude   0.00
## 
##  To see confidence intervals of the correlations, print with the short=FALSE option
# Correlation heatmap of the numeric features, hierarchically clustered
corrplot(mycorr$r,order = "hclust")

Number of Employees has a good correlation with the number of trips (V1).

# Searching for the optimum lambda that yields the least cost
lambdas_to_try <- 10^seq(-3, 3, length.out = 100)
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 3.0-2
# Tuning lambda with 10-fold cross-validation (alpha = 0 -> ridge penalty)
ridge_cv <- cv.glmnet(x, y, alpha = 0, lambda = lambdas_to_try, standardize = FALSE, nfolds = 10)

# Plotting lambda versus the mean squared error
plot(ridge_cv)

lambda_ridge_cv <- ridge_cv$lambda.min

# Training the ridge model using the optimum lambda
model_ridge <- glmnet(x, y,  alpha = 0, lambda = lambda_ridge_cv,standardize = FALSE)


# Training-set prediction
y_hat_ridge <- predict(model_ridge, x)

# R-squared for training
rsq_ridge <- cor(y, y_hat_ridge)^2
print(rsq_ridge)
##             s0
## [1,] 0.2988686
# Applying the same encoding and feature engineering to the test set
trsf1 = data.frame(predict(dmy,newdata = bike1Test ))
trsf1$HNP = (trsf1$WeekDay.Sat+trsf1$WeekDay.Sun)*(trsf1$Hour.7+trsf1$Hour.8)
trsf1$HP = (trsf1$WeekDay.Sat+trsf1$WeekDay.Sun)*(trsf1$Hour.11+trsf1$Hour.12+trsf1$Hour.13+trsf1$Hour.14)
trsf1$NP = trsf1$Hour.0+trsf1$Hour.1+trsf1$Hour.2+trsf1$Hour.3+trsf1$Hour.4+trsf1$Hour.5
trsf1$P = trsf1$Hour.7+trsf1$Hour.8+trsf1$Hour.17+trsf1$Hour.18
y1 <- as.matrix( trsf1[, "V1"] )
x1 <- as.matrix( trsf1[, names(trsf1)[names(trsf1)!="V1"]] )

# Test-set prediction
y1_hat_ridge <-predict(model_ridge, x1)

# R-squared for testing
rsq_ridge <- cor(y1, y1_hat_ridge)^2
print(rsq_ridge)
##             s0
## [1,] 0.2673336
head(y1)
##          [,1]
## [1,] 2.000000
## [2,] 1.666667
## [3,] 1.000000
## [4,] 1.428571
## [5,] 1.666667
## [6,] 1.500000
head(y1_hat_ridge)
##         s0
## 1 3.541375
## 2 3.559652
## 3 2.313968
## 4 5.106241
## 5 3.319669
## 6 1.735085
# Beta coefficients of the fitted ridge model
model_ridge$beta
## 51 x 1 sparse Matrix of class "dgCMatrix"
##                       s0
## AreaSqKm     0.071735721
## IncomeScor  -0.313027263
## LivingEnSc   0.015515343
## NoEmployee   0.149351741
## GrenSpace   -0.086053833
## BornUK      -0.163430636
## NotBornUK   -0.398276136
## NoFlats      0.286676793
## NoHouses     0.188118834
## NoOwndDwel  -0.168201685
## MedHPrice   -0.164413821
## Capacity     0.209120897
## Longitude    0.077509735
## Latitude     0.314660721
## WeekDay.Mon -0.009858863
## WeekDay.Tue  0.070621382
## WeekDay.Wed -0.117521200
## WeekDay.Thu  0.008417847
## WeekDay.Fri -0.029776990
## WeekDay.Sat  0.065361005
## WeekDay.Sun  0.013208900
## WeekEnd.0   -0.029448943
## WeekEnd.1    0.005965379
## Hour.0       0.046514242
## Hour.1      -0.042270905
## Hour.2      -0.142138979
## Hour.3      -0.114174224
## Hour.4      -0.275590981
## Hour.5      -0.207268627
## Hour.6      -0.528794420
## Hour.7      -0.007959998
## Hour.8       1.576824230
## Hour.9       0.408843040
## Hour.10     -0.036600887
## Hour.11     -0.209933457
## Hour.12      0.006099475
## Hour.13      0.045265874
## Hour.14     -0.025259011
## Hour.15      0.378272257
## Hour.16      0.621496122
## Hour.17      0.153398493
## Hour.18      0.033981591
## Hour.19      0.413072067
## Hour.20     -0.196834562
## Hour.21     -0.456833745
## Hour.22     -0.556728841
## Hour.23     -0.637261984
## HNP         -2.922824921
## HP           1.171539151
## NP          -0.797240189
## P            1.249754031

Interpretation

The number of trips increases in peak hours and decreases in off-peak hours. The geographic location of the stations and ward centers also has a high impact on the number of trips.

As expected, the number of trips increases in areas with a high number of employees. Also, areas with more residents born outside the UK appear to be associated with fewer bike trips.

The Newly Engineered features have significant impact. Particularly the Weekend non peak hours(HNP).

set.seed(10)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:psych':
## 
##     outlier
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:seasonal':
## 
##     outlier
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:gridExtra':
## 
##     combine
# Random forest regression on the same design matrices (default settings)
bike.rf = randomForest(x,y)
## Warning in rfout$mse/(var(y) * (n - 1)/n): Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.
summary(bike.rf)
##                 Length Class  Mode     
## call                3  -none- call     
## type                1  -none- character
## predicted       12534  -none- numeric  
## mse               500  -none- numeric  
## rsq               500  -none- numeric  
## oob.times       12534  -none- numeric  
## importance         51  -none- numeric  
## importanceSD        0  -none- NULL     
## localImportance     0  -none- NULL     
## proximity           0  -none- NULL     
## ntree               1  -none- numeric  
## mtry                1  -none- numeric  
## forest             11  -none- list     
## coefs               0  -none- NULL     
## y               12534  -none- numeric  
## test                0  -none- NULL     
## inbag               0  -none- NULL
# Test-set predictions and performance for the random forest
perf$rf_pred = predict(bike.rf,x1)
perf$rf_actual = (bike1Test[,"V1"])
library(Metrics)
library(MLmetrics)
R2_Score(perf$rf_actual, perf$rf_pred)
## [1] -0.09660702
RMSE(perf$rf_actual, perf$rf_pred)
## [1] 1.222165
head(perf$rf_pred)
## [1] 2.868576 3.173746 1.352768 5.315107 2.841447 1.645584
head(perf$rf_actual)
## [1] 2.000000 1.666667 1.000000 1.428571 1.666667 1.500000
# Variable importance (IncNodePurity)
bike.rf$importance
##             IncNodePurity
## AreaSqKm       1126.78503
## IncomeScor      836.35934
## LivingEnSc     1240.07743
## NoEmployee     3268.23550
## GrenSpace       959.97117
## BornUK         1110.33004
## NotBornUK      2217.09415
## NoFlats        1065.42380
## NoHouses       1496.89526
## NoOwndDwel     1107.23894
## MedHPrice      1214.14197
## Capacity       1879.11748
## Longitude      1139.16170
## Latitude       1815.61659
## WeekDay.Mon     375.74431
## WeekDay.Tue     410.66565
## WeekDay.Wed     357.70574
## WeekDay.Thu     340.27800
## WeekDay.Fri     351.48297
## WeekDay.Sat     409.64912
## WeekDay.Sun     458.50916
## WeekEnd.0      1332.96596
## WeekEnd.1      1315.29196
## Hour.0           47.51124
## Hour.1           40.18374
## Hour.2           58.74811
## Hour.3           23.72540
## Hour.4           22.20557
## Hour.5           34.31158
## Hour.6          453.08315
## Hour.7         1079.16587
## Hour.8         2168.96918
## Hour.9          577.18885
## Hour.10         211.60929
## Hour.11         182.37521
## Hour.12         225.44181
## Hour.13         247.40891
## Hour.14         261.41202
## Hour.15         583.54815
## Hour.16         979.31344
## Hour.17        2108.07515
## Hour.18         753.99875
## Hour.19         404.17161
## Hour.20         181.92696
## Hour.21         293.61683
## Hour.22         296.52897
## Hour.23         337.18083
## HNP             850.68423
## HP              984.41358
## NP             1473.87563
## P              3049.38250

Hours and weekdays are the most informative attributes in decreasing node impurity. The Number of Employees also has a significant impact on the average number of trips per hour per station.

Summary

Ridge performs better than the plain linear model, with R2 = 0.27 on the test set. Random Forest achieves the lowest test RMSE (1.22), although its test R2 is slightly negative (-0.10).